library(tidyverse)
library(dplyr)
library(readxl)
#library(ggmap)
#library(randomForest)
library(eeptools)
library(stringr)
library(lubridate)
library(weights)
library(survey)
#library(srvy)
library(stargazer)
#library(gtsummary)
library(data.table)
library(anesrake)
library(estimatr)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(lmtest)
#library(effects)
#library(multiwayvcov)
library(rddtools)
library(magrittr)
library(stargazer)

# Weighting code based in part on https://zacharylhertz.github.io/posts/2022/05/weighting-surveys

setwd('/Users/celinascott-buechler/DFP/dac_survey/Replication/Analysis/National Survey/Data')

dac_conjoint <- read_csv('NEWEST_DAC_survey_only.csv', col_names = TRUE)

dac_conjoint <- dac_conjoint[-1, ]

dac_conjoint <- dac_conjoint[, !colnames(dac_conjoint) %in%
                               c('StartDate', 'EndDate', 'Status')]

#Keep only respondents who completed the survey and passed the simple comprehension check
dac_conjoint <- filter(dac_conjoint, FinishedSurvey == 1 & Complete == 1 & AttentionCheck =='Desk')

#Keep only respondents who completed the survey and passed the simple comprehension check
#dac_conjoint <- filter(dac_conjoint, FinishedSurvey == 1 & Complete == 1 & AttentionCheck =='Desk' & ComprehensionCheck=="Direct air capture (DAC) removes carbon dioxide emissions from the atmosphere.")

#Remove fastest 5% of survey respondents
dac_conjoint$`Duration (in seconds)` <- as.numeric(dac_conjoint$`Duration (in seconds)`)
length(dac_conjoint$`Duration (in seconds)`)*0.05 #exclude 63 responses
dac_conjoint <- dac_conjoint %>% arrange(desc(`Duration (in seconds)`)) %>% slice(1:(n()-63))

#Calculate ages
dac_conjoint$BirthDate <- str_c(dac_conjoint$YearBorn, '-', dac_conjoint$MonthBorn, '-', dac_conjoint$DayBorn) %>%
  as_date(.)
dac_conjoint$Age <- age_calc(na.omit(dac_conjoint$BirthDate),units='years', precise = FALSE)

dac_conjoint <- dac_conjoint[, !colnames(dac_conjoint) %in% c('MonthBorn', 'DayBorn', 'YearBorn')]

dac_conjoint <- dac_conjoint %>% filter(Age>17)

#Combine push answers with their original questions
dac_conjoint <- dac_conjoint%>%
  mutate(IndustryPromisesPush=ifelse(is.na(IndustryPromisesPush), IndustryPromises,IndustryPromisesPush))%>%
  mutate(JobsNeedPush=ifelse(is.na(JobsNeedPush), JobsNeed,JobsNeedPush))

dac_conjoint <- dac_conjoint[, !colnames(dac_conjoint) %in% c('JobsNeed', 'IndustryPromises')]

dac_conjoint <- dac_conjoint %>% rename('JobsNeed' = 'JobsNeedPush',
                                        'IndustryPromises' = 'IndustryPromisesPush')

#Create regions based on states
west <- c('Arizona','Washington', 'Oregon', 'California', 'Nevada', 'Utah', 'Idaho', 'Montana', 'Wyoming', 'Colorado', 'Alaska', 'Hawaii', 'American Samoa')
south <- c('New Mexico', 'Texas','Oklahoma', 'Arkansas','Louisiana', 'Mississippi', 'Alabama', 'Georgia', 'Tennessee', 'Kentucky', 'West Virginia', 'Florida',
           'South Carolina', 'North Carolina', 'Delaware', 'District of Columbia', 'Virginia', 'Maryland')
midwest <- c('North Dakota', 'South Dakota','Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri', 'Wisconsin',
             'Illinois', 'Indiana', 'Ohio', 'Michigan')
northeast <- c('Pennsylvania', 'New Jersey', 'Connecticut', 'Rhode Island', 'New York', 'Vermont', 'Maine', 'New Hampshire','Massachusetts')

#Age categories
dac_conjoint <- dac_conjoint %>%
  mutate(
    # Create categories
    AgeGroup = dplyr::case_when(
      Age > 17 & Age <= 29 ~ "18-29",
      Age > 29 & Age <= 44 ~ "30-44",
      Age > 44 & Age <= 54 ~ "45-54",
      Age > 54 & Age <= 64 ~ "55-64",
      Age > 64             ~ "65+"
    ),
    # Convert to factor
    AgeGroup = factor(
      AgeGroup,
      level = c("18-29", "30-44","45-54", "55-64", "65+")
    )
  )

dac_conjoint$Region[dac_conjoint$State %in% west] <- 'West'
dac_conjoint$Region[dac_conjoint$State %in% south] <- 'South'
dac_conjoint$Region[dac_conjoint$State %in% midwest] <- 'Midwest'
dac_conjoint$Region[dac_conjoint$State %in% northeast] <- 'Northeast'

#Raking

PUS_a <- fread("csv_pus/psam_pusa.csv",
               select= c("HISP", "AGEP", "REGION", "DIVISION",
                         "RAC1P", "SCHL", "SEX", "PWGTP"))
PUS_b <- fread("csv_pus/psam_pusb.csv",
               select= c("HISP", "AGEP", "REGION", "DIVISION",
                         "RAC1P", "SCHL", "SEX", "PWGTP"))

acs <- rbind(PUS_a, PUS_b)
#rm(PUS_a,PUS_b)
acs <- subset(acs, acs$AGEP>17)

# Recode variables
acs$hispanic[acs$HISP!=1] <- 1  #Hispanic
acs$hispanic[acs$HISP==1] <- 2 #Nonhispanic

acs$sex[acs$SEX==1] <- 1 #Male
acs$sex[acs$SEX==2] <- 2 #Female

acs$region[acs$REGION==1] <- 1 # Northeast
acs$region[acs$REGION==2] <- 2 # Midwest
acs$region[acs$REGION==3] <- 3 # South
acs$region[acs$REGION==4] <- 4 # West

acs$age5[acs$AGEP<30] <- 1 # 18-29
acs$age5[acs$AGEP>29 & acs$AGEP<45] <- 2 # 30-44
acs$age5[acs$AGEP>=45 & acs$AGEP<55] <- 3 # 45-54
acs$age5[acs$AGEP>=55 & acs$AGEP<65] <- 4 # 55-64
acs$age5[acs$AGEP>64] <- 5 # 65+

acs$educ4[acs$SCHL<17] <- 1 # HS or less
acs$educ4[acs$SCHL>16 & acs$SCHL<20] <- 2 # Some college
acs$educ4[acs$SCHL==20] <- 3 # College degree
acs$educ4[acs$SCHL>20] <- 4 # Post-graduate degree

acs$race[acs$RAC1P==1] <- 1 #White alone
acs$race[acs$RAC1P==2] <- 2 #Black alone"
acs$race[acs$RAC1P>2 &
           acs$RAC1P<6] <- 3 #American Indian or Alaskan Native
acs$race[acs$RAC1P==6] <- 4 #Asian alone
acs$race[acs$RAC1P>6] <- 5 #Native Hawaiian and Other Pacific Islander & Other

# Create survey design object using ACS and weights
svy.acs <- svydesign(ids=~1, data=acs, weights=acs$PWGTP)

# Creating targets
Gender_num <- svytable(~sex, design=svy.acs) %>%
  prop.table() %>%
  round(digits=3) %>%
  as.numeric() # Male: 49%, Female: 51%

Region_num <- svytable(~region, design=svy.acs) %>%
  prop.table() %>%
  round(digits=3) %>%
  as.numeric() # NE: 17.6%, MW: 20.7%, S: 37.8%, W: 23.6%

Age_num <- svytable(~age5, design=svy.acs) %>%
  prop.table() %>%
  round(digits=3) %>%
  as.numeric() # 18-29: 20.2%, 30-44: 25.9%, 45-54: 16.6%, 55-64: 16.6%, 65+:20.2%

Education_num <- svytable(~educ4, design=svy.acs) %>%
  prop.table() %>%
  round(digits=3) %>%
  as.numeric() # HS or less: 34.3%, Some College: 24.9%, College: 8.4%, Postgraduate: 29.5%

Race_num <- svytable(~race, design=svy.acs) %>%
  prop.table() %>%
  round(digits=3) %>%
  as.numeric() # White: 61.0%, Black: 11.6%, Hispanic: 16.8%, Asian: 6.0%, Other: 4.6%
#White: 63.6%, Black: 11.8%, AI/AN: 0.9%, Asian: 6.0%, Hawaiian or Pacific Islander: 0.2%, Some other alone: 6.7%, Two or more: 10.9% -> 1.001

Hispanic_num <- svytable(~hispanic, design=svy.acs) %>%
  prop.table() %>%
  round(digits=3) %>%
  as.numeric() # Hispanic: 16.9%, Non-hispanic: 83.1%


dac_conjoint$Hispanic_num[dac_conjoint$Hispanic=='I am of Hispanic or Latino descent'] <- 1  #Hispanic
dac_conjoint$Hispanic_num[dac_conjoint$Hispanic=='I am not of Hispanic or Latino descent'] <- 2 #Nonhispanic

dac_conjoint$Gender_num[dac_conjoint$Gender=='Male'] <- 1 #Male
dac_conjoint$Gender_num[dac_conjoint$Gender=='Female'] <- 2 #Female

dac_conjoint$Region_num[dac_conjoint$Region=='Northeast'] <- 1 # Northeast
dac_conjoint$Region_num[dac_conjoint$Region=='Midwest'] <- 2 # Midwest
dac_conjoint$Region_num[dac_conjoint$Region=='South'] <- 3 # South
dac_conjoint$Region_num[dac_conjoint$Region=='West'] <- 4 # West

dac_conjoint$Age_num[dac_conjoint$Age<30] <- 1 # 18-29
dac_conjoint$Age_num[dac_conjoint$Age>29 & dac_conjoint$Age<45] <- 2 # 30-44
dac_conjoint$Age_num[dac_conjoint$Age>=45 & dac_conjoint$Age<55] <- 3 # 45-54
dac_conjoint$Age_num[dac_conjoint$Age>=55 & dac_conjoint$Age<65] <- 4 # 55-64
dac_conjoint$Age_num[dac_conjoint$Age>64] <- 5 # 65+

dac_conjoint$Education_num[dac_conjoint$Education=="No high school diploma" |
                             dac_conjoint$Education=="High school diploma or equivalent"] <- 1 # HS or less
dac_conjoint$Education_num[dac_conjoint$Education=="Some college, but no degree"] <- 2 # Some college
dac_conjoint$Education_num[dac_conjoint$Education=="Associate's degree" |
                             dac_conjoint$Education== "Bachelor's degree" ] <- 3 # College degree
dac_conjoint$Education_num[dac_conjoint$Education=="Advanced degree (such as Master's, Professional, or Doctorate degree)"] <- 4 # Post-graduate degree

dac_conjoint$Race_num[dac_conjoint$Race=='White'] <- 1 #White alone
dac_conjoint$Race_num[dac_conjoint$Race=="Black or African American" ] <- 2 #Black alone"
dac_conjoint$Race_num[dac_conjoint$Race=="American Indian or Alaska Native"] <- 3 #American Indian or Alaskan Native
dac_conjoint$Race_num[dac_conjoint$Race=="Asian"] <- 4 #Asian alone
dac_conjoint$Race_num[dac_conjoint$Race=="Other race"|
                        dac_conjoint$Race=='Hispanic or Latino/a'] <- 5 #Native Hawaiian, Pacific Islander, Hispanic non-white & Other

dac_conjoint_df<- as.data.frame(dac_conjoint)

targets <- list(Gender_num,  Age_num, Region_num, Education_num, Race_num, Hispanic_num)
# remember, these names will have to match
names(targets) <- c('Gender_num',  'Age_num', 'Region_num', 'Education_num', 'Race_num', 'Hispanic_num')

dac_conjoint_df$Gender_num <- as.factor(dac_conjoint_df$Gender_num)
dac_conjoint_df$Hispanic_num <- as.factor(dac_conjoint_df$Hispanic_num)
dac_conjoint_df$Age_num <- as.factor(dac_conjoint_df$Age_num)
dac_conjoint_df$Education_num <- as.factor(dac_conjoint_df$Education_num)
dac_conjoint_df$Race_num <- as.factor(dac_conjoint_df$Race_num)

names(targets$Hispanic_num) <- levels(dac_conjoint_df$Hispanic_num)
names(targets$Gender_num) <- levels(dac_conjoint_df$Gender_num)
names(targets$Region_num) <- levels(dac_conjoint$Region_num)
names(targets$Age_num) <- levels(dac_conjoint_df$Age_num)
names(targets$Education_num) <- levels(dac_conjoint_df$Education_num)
names(targets$Race_num) <- levels(dac_conjoint_df$Race_num)


# Create the weights
weights <- anesrake(targets, dac_conjoint_df,
                    caseid = dac_conjoint_df$ResponseId)

# Store the weights as a variable
dac_conjoint_df$nationalweight  <- unlist(weights[1])

j_national <-svydesign(ids = ~1, data = dac_conjoint_df, weights = dac_conjoint_df$nationalweight)

#Write file for cleaned data
write.csv(dac_conjoint_df, 'dac_cleaned_final.csv', row.names = TRUE)

summary(dac_conjoint_df)
length(dac_conjoint_df$Progress)
